from matplotlib import pyplot as plt
from tqdm.notebook import tqdm
from pyprojroot import hereRegional models
Forecasts from the ECDC FCH
from datetime import date
min_date = date(2020, 10, 12)
max_date = date(2021, 9, 15)import pandas as pd
def forecast_long_to_wide(df: pd.DataFrame) -> pd.DataFrame:
# Quantile column mapping
forecast_quantiles = [
0.01,
0.025,
*[round(0.05 * q, 3) for q in range(1, 20)],
0.975,
0.99,
]
col_map = {q: f"{q * 100:.1f} %" for q in forecast_quantiles}
# Pivot quantile rows
q = df[df["type"] == "quantile"].copy()
q["quantile"] = pd.to_numeric(q["quantile"])
piv = (
q[q["quantile"].isin(col_map)]
.pivot(index="target_end_date", columns="quantile", values="value")
.rename(columns=col_map)
)
# Add mean and sd (if present)
res = piv.copy()
means = (
df[df["type"] == "mean"][["target_end_date", "value"]]
.drop_duplicates()
.set_index("target_end_date")
.rename(columns={"value": "mean"})
)
res = res.join(means)
# Set final column order and output
cols = ["mean", *col_map.values()]
out = res[cols].reset_index().rename(columns={"target_end_date": "date"})
return outimport os
from pathlib import Path
import subprocess
import pandas as pd
from typing import Literal
def standardize_forecast_dfs(
daily_predictions: pd.DataFrame, level: Literal["country", "county"]
) -> pd.DataFrame:
query_string = (
("target == '1 wk ahead inc case' and (location == 'DE' or location == 'GM')")
if level == "country"
else "target == '1 wk ahead inc case' and location like 'GM%'"
)
wide_df = forecast_long_to_wide(
daily_predictions.query(query_string)[
["target_end_date", "type", "quantile", "value"]
]
)
wide_df["date"] = pd.to_datetime(wide_df["date"], format="%Y-%m-%d").dt.date
return wide_df[(wide_df["date"] >= min_date) & (wide_df["date"] <= max_date)]
def read_fch_submissions(
model_dir: str, repo_url: str, clone_dir: Path
) -> pd.DataFrame:
if not clone_dir.exists():
subprocess.run(
["git", "clone", "--depth", "1", repo_url, str(clone_dir)], check=True
)
csv_dir_path = clone_dir / model_dir
csv_files = [
csv_dir_path / file
for file in os.listdir(csv_dir_path)
if file.endswith(".csv")
]
combined_df = pd.concat(
[
pd.read_csv(file_path).assign(filename=file_path.name)
for file_path in csv_files
],
ignore_index=True,
)
return standardize_forecast_dfs(combined_df)# Define repository and local clone directory
model_names = ["ITWW-county_repro", "EuroCOVIDhub-baseline", "EuroCOVIDhub-ensemble"]
models_ecdc = {
name: read_fch_submissions(
f"data-processed/{name}",
"https://github.com/european-modelling-hubs/covid19-forecast-hub-europe_archive.git",
Path("/tmp/covid19-forecast-hub-europe_archive"),
)
for name in model_names
}
itww_county_repro = models_ecdc["ITWW-county_repro"]
baseline = models_ecdc["EuroCOVIDhub-baseline"]
ensemble = models_ecdc["EuroCOVIDhub-ensemble"]Cloning into '/tmp/covid19-forecast-hub-europe_archive'...
fetch-pack: unexpected disconnect while reading sideband packet
--------------------------------------------------------------------------- KeyboardInterrupt Traceback (most recent call last) Cell In[3], line 4 1 # Define repository and local clone directory 2 model_names = ["ITWW-county_repro", "EuroCOVIDhub-baseline", "EuroCOVIDhub-ensemble"] ----> 4 models_ecdc = { 5 name: read_fch_submissions( 6 f"data-processed/{name}", 7 "https://github.com/european-modelling-hubs/covid19-forecast-hub-europe_archive.git", 8 Path("/tmp/covid19-forecast-hub-europe_archive"), 9 ) 10 for name in model_names 11 } 13 itww_county_repro = models_ecdc["ITWW-county_repro"] 14 baseline = models_ecdc["EuroCOVIDhub-baseline"] Cell In[3], line 5, in <dictcomp>(.0) 1 # Define repository and local clone directory 2 model_names = ["ITWW-county_repro", "EuroCOVIDhub-baseline", "EuroCOVIDhub-ensemble"] 4 models_ecdc = { ----> 5 name: read_fch_submissions( 6 f"data-processed/{name}", 7 "https://github.com/european-modelling-hubs/covid19-forecast-hub-europe_archive.git", 8 Path("/tmp/covid19-forecast-hub-europe_archive"), 9 ) 10 for name in model_names 11 } 13 itww_county_repro = models_ecdc["ITWW-county_repro"] 14 baseline = models_ecdc["EuroCOVIDhub-baseline"] Cell In[2], line 33, in read_fch_submissions(model_dir, repo_url, clone_dir) 29 def read_fch_submissions( 30 model_dir: str, repo_url: str, clone_dir: Path 31 ) -> pd.DataFrame: 32 if not clone_dir.exists(): ---> 33 subprocess.run( 34 ["git", "clone", "--depth", "1", repo_url, str(clone_dir)], check=True 35 ) 37 csv_dir_path = clone_dir / model_dir 38 csv_files = [ 39 csv_dir_path / file 40 for file in os.listdir(csv_dir_path) 41 if file.endswith(".csv") 42 ] File ~/.local/share/uv/python/cpython-3.10.17-macos-aarch64-none/lib/python3.10/subprocess.py:505, in run(input, capture_output, timeout, check, *popenargs, **kwargs) 503 with Popen(*popenargs, **kwargs) as process: 504 try: --> 505 stdout, stderr = process.communicate(input, timeout=timeout) 506 except TimeoutExpired as exc: 507 process.kill() File ~/.local/share/uv/python/cpython-3.10.17-macos-aarch64-none/lib/python3.10/subprocess.py:1146, in Popen.communicate(self, input, timeout) 1144 stderr = self.stderr.read() 1145 self.stderr.close() -> 1146 self.wait() 1147 else: 1148 if timeout is not None: File ~/.local/share/uv/python/cpython-3.10.17-macos-aarch64-none/lib/python3.10/subprocess.py:1209, in Popen.wait(self, timeout) 1207 endtime = _time() + timeout 1208 try: -> 1209 return self._wait(timeout=timeout) 1210 except KeyboardInterrupt: 1211 # https://bugs.python.org/issue25942 1212 # The first keyboard interrupt waits briefly for the child to 1213 # exit under the common assumption that it also received the ^C 1214 # generated SIGINT and will exit rapidly. 1215 if timeout is not None: File ~/.local/share/uv/python/cpython-3.10.17-macos-aarch64-none/lib/python3.10/subprocess.py:1959, in Popen._wait(self, timeout) 1957 if self.returncode is not None: 1958 break # Another thread waited. -> 1959 (pid, sts) = self._try_wait(0) 1960 # Check the pid and loop as waitpid has been known to 1961 # return 0 even without WNOHANG in odd situations. 1962 # http://bugs.python.org/issue14396. 1963 if pid == self.pid: File ~/.local/share/uv/python/cpython-3.10.17-macos-aarch64-none/lib/python3.10/subprocess.py:1917, in Popen._try_wait(self, wait_flags) 1915 """All callers to this function MUST hold self._waitpid_lock.""" 1916 try: -> 1917 (pid, sts) = os.waitpid(self.pid, wait_flags) 1918 except ChildProcessError: 1919 # This happens if SIGCLD is set to be ignored or waiting 1920 # for child processes has otherwise been disabled for our 1921 # process. This child is dead, we can't get the status. 1922 pid = self.pid KeyboardInterrupt:
for model, df in models_ecdc.items():
df.to_csv(here(f"data/processed/ECDC_FCH_{model}.csv"), index=False)Forecasts from the German FCH
from datetime import timedelta
fch_repo_url = "https://github.com/KITmetricslab/covid19-forecast-hub-de"
def fch_data_url(model: str, forecast_date: date) -> str:
submission_date = (forecast_date + timedelta(days=-5)).strftime("%Y-%m-%d")
return f"https://raw.githubusercontent.com/KITmetricslab/covid19-forecast-hub-de/refs/heads/master/data-processed/{model}/{submission_date}-Germany-{model}-case.csv"
forecast_dates = pd.date_range(
start=date(2020, 10, 12), end=date(2021, 9, 15), freq="W-SAT"
).date.tolist()
fch_models = ["KIT-baseline", "ITWW-county_repro", "KITCOVIDhub-median_ensemble"]
fch_dfs = {
model: pd.concat(
[
pd.read_csv(fch_data_url(model, fd))
for fd in tqdm(forecast_dates, desc=f"Reading {model}")
]
)
for model in fch_models
}--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[3], line 12 8 submission_date = (forecast_date + timedelta(days=-5)).strftime("%Y-%m-%d") 9 return f"https://raw.githubusercontent.com/KITmetricslab/covid19-forecast-hub-de/refs/heads/master/data-processed/{model}/{submission_date}-Germany-{model}-case.csv" ---> 12 forecast_dates = pd.date_range( 13 start=date(2020, 10, 12), end=date(2021, 9, 15), freq="W-SAT" 14 ).date.tolist() 16 fch_models = ["KIT-baseline", "ITWW-county_repro", "KITCOVIDhub-median_ensemble"] 17 fch_dfs = { 18 model: pd.concat( 19 [ (...) 24 for model in fch_models 25 } NameError: name 'pd' is not defined
standardize_forecast_dfs(fch_dfs["KITCOVIDhub-median_ensemble"])| date | mean | 1.0 % | 2.5 % | 5.0 % | 10.0 % | 15.0 % | 20.0 % | 25.0 % | 30.0 % | ... | 60.0 % | 65.0 % | 70.0 % | 75.0 % | 80.0 % | 85.0 % | 90.0 % | 95.0 % | 97.5 % | 99.0 % | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2020-10-17 | NaN | 21539.423788 | 22569.355225 | 23568.090312 | 24586.979322 | 25139.325838 | 25561.499631 | 25932.539485 | 26277.073961 | ... | 28591.117862 | 29302.140058 | 30152.710629 | 31079.838904 | 32123.809361 | 33356.524172 | 34932.633915 | 37322.459967 | 39450.101466 | 41991.884426 |
| 1 | 2020-10-24 | NaN | 36259.974108 | 40010.212500 | 43312.925000 | 47192.300000 | 49605.825000 | 51872.358536 | 52952.611707 | 53923.109962 | ... | 58291.199381 | 58716.924259 | 59153.412378 | 59671.920685 | 60831.195953 | 62635.234335 | 64701.151489 | 67693.593562 | 70602.392253 | 74215.002332 |
| 2 | 2020-10-31 | NaN | 71773.589517 | 73638.165669 | 75280.502599 | 77219.523177 | 79654.000000 | 83829.000000 | 87584.000000 | 90963.000000 | ... | 107475.000000 | 110180.000000 | 112972.000000 | 116318.000000 | 119771.000000 | 123925.000000 | 126976.861592 | 130891.717572 | 133932.255716 | 138512.637213 |
| 3 | 2020-11-07 | NaN | 128928.056101 | 131639.839424 | 134138.933859 | 138871.261618 | 141210.839387 | 143696.640767 | 145158.876872 | 146898.054911 | ... | 153972.172672 | 155487.580999 | 156963.110160 | 159133.790947 | 163005.450209 | 167518.341931 | 173196.590203 | 181612.609510 | 188912.251145 | 197399.662078 |
| 4 | 2020-11-14 | NaN | 119952.026330 | 129980.150000 | 136067.000000 | 137503.000000 | 138904.000000 | 140194.000000 | 141949.000000 | 147334.000000 | ... | 173572.000000 | 177833.000000 | 182631.000000 | 187491.000000 | 193217.000000 | 199890.000000 | 207928.000000 | 219977.000000 | 231903.000000 | 246066.000000 |
| 5 | 2020-11-21 | NaN | 97396.242482 | 104060.481932 | 110108.458379 | 115683.488800 | 119223.418142 | 120945.000000 | 122059.000000 | 123346.000000 | ... | 136235.901986 | 139750.219440 | 146430.900000 | 153778.250000 | 163680.600000 | 170636.000000 | 177966.000000 | 187194.000000 | 197020.000000 | 208404.000000 |
| 6 | 2020-11-28 | NaN | 75040.120473 | 82538.858916 | 89002.873287 | 97030.960444 | 102229.248647 | 106634.864174 | 110372.743812 | 113812.616748 | ... | 129905.667002 | 132834.295917 | 137257.829453 | 143179.904368 | 147545.934159 | 152884.334404 | 159796.770812 | 170003.316064 | 179230.239236 | 190778.863126 |
| 7 | 2020-12-05 | NaN | 99073.714364 | 103654.377174 | 106429.500825 | 110397.726206 | 113170.402772 | 115296.137416 | 117413.526398 | 118612.429230 | ... | 123892.058085 | 124821.077343 | 125814.051913 | 129138.353260 | 130552.346895 | 132309.689481 | 134185.351891 | 136810.836639 | 138873.171844 | 144911.747438 |
| 8 | 2020-12-12 | NaN | 102028.092074 | 106971.000000 | 110320.900000 | 115605.200000 | 118121.700000 | 120789.500000 | 123099.200000 | 125180.900000 | ... | 136966.944209 | 137670.389527 | 141174.700000 | 145046.100000 | 147937.200000 | 151471.400000 | 154830.700000 | 162936.000000 | 168541.900000 | 174167.200000 |
| 9 | 2020-12-19 | NaN | 118050.856771 | 123185.524846 | 127674.122385 | 132990.545650 | 136580.749582 | 139517.928723 | 141841.483594 | 143634.146435 | ... | 163604.357360 | 166131.343093 | 170999.825897 | 179230.857426 | 181605.922234 | 186100.779239 | 189526.778717 | 197975.255846 | 201728.712142 | 205654.302086 |
| 10 | 2020-12-26 | NaN | 148252.940110 | 155455.602883 | 159761.985631 | 164009.878798 | 166702.444583 | 170163.928050 | 174883.302868 | 179017.020334 | ... | 195915.843768 | 200000.375000 | 206957.050000 | 209845.350721 | 213586.108320 | 218212.550535 | 224385.260867 | 234773.215769 | 242729.132301 | 253027.603250 |
| 11 | 2021-01-02 | NaN | 132075.689996 | 136473.511807 | 140014.768738 | 145177.816583 | 147687.264178 | 149729.869615 | 151226.278139 | 154574.085135 | ... | 178937.000000 | 182924.500000 | 188915.000000 | 194982.000000 | 201024.000000 | 214203.000000 | 221485.500000 | 229500.050000 | 237832.600000 | 244060.800000 |
| 12 | 2021-01-09 | NaN | 70866.000000 | 73220.000000 | 75230.970694 | 89936.632948 | 99858.473456 | 107744.037762 | 110582.000000 | 113713.000000 | ... | 134039.000000 | 136517.168760 | 138530.218154 | 141134.250000 | 143679.000000 | 147026.000000 | 153446.000000 | 157519.000000 | 168203.000000 | 174302.000000 |
| 13 | 2021-01-16 | NaN | 108226.842253 | 116490.412037 | 120290.786069 | 123473.865113 | 125682.614486 | 127926.356360 | 131371.871120 | 133982.655333 | ... | 151166.717929 | 154642.301674 | 158108.866648 | 163738.650933 | 175791.735578 | 182563.520762 | 187019.207026 | 194751.634072 | 208596.830790 | 222159.902957 |
| 14 | 2021-01-23 | NaN | 90075.738919 | 91272.394498 | 92656.279309 | 97069.706332 | 99886.312797 | 101186.952415 | 102671.828385 | 104347.425973 | ... | 117602.352819 | 122909.187183 | 128818.443137 | 133090.953468 | 137862.039075 | 143441.468364 | 150045.428596 | 153450.667358 | 159628.766128 | 166703.621357 |
| 15 | 2021-01-30 | NaN | 65036.947862 | 68207.400000 | 70682.000000 | 75393.000000 | 77368.000000 | 79157.000000 | 81480.943035 | 83039.536978 | ... | 92997.600000 | 98667.660132 | 100923.091691 | 103357.056965 | 106067.392719 | 109226.620058 | 113201.649437 | 119093.241773 | 124203.319488 | 130144.889176 |
| 16 | 2021-02-06 | NaN | 49254.540013 | 51787.469208 | 54078.526834 | 56861.941243 | 60818.304772 | 63168.000000 | 64366.000000 | 65382.000000 | ... | 73209.000000 | 74270.955065 | 75502.550201 | 77455.000000 | 79177.000000 | 81298.000000 | 83850.000000 | 88527.601794 | 93628.147307 | 101838.781549 |
| 17 | 2021-02-13 | NaN | 37152.562107 | 42472.500000 | 46084.000000 | 48519.000000 | 50146.000000 | 51523.500000 | 52761.000000 | 53803.000000 | ... | 60470.000000 | 61562.500000 | 62734.500000 | 65672.473412 | 66966.335296 | 67612.915186 | 68556.088649 | 71154.904743 | 74633.539985 | 77320.857092 |
| 18 | 2021-02-20 | NaN | 29131.745604 | 30745.834812 | 33059.338448 | 35137.000000 | 36607.000000 | 37765.500000 | 38735.500000 | 39764.000000 | ... | 44152.397716 | 44896.039654 | 45691.950000 | 47712.500000 | 48803.500000 | 50149.500000 | 51187.489007 | 52538.011199 | 53841.000154 | 55551.777654 |
| 19 | 2021-02-27 | NaN | 32440.608145 | 34672.594315 | 36640.027133 | 38773.633470 | 40211.763804 | 41419.541312 | 43029.110674 | 44420.979548 | ... | 51590.496672 | 52336.586061 | 52842.945027 | 53569.401678 | 54279.415587 | 55359.038862 | 57116.031617 | 60116.754495 | 63417.636241 | 67299.315994 |
| 20 | 2021-03-06 | NaN | 38867.021435 | 41970.813456 | 44609.398411 | 47697.406302 | 49759.328652 | 51298.660577 | 52692.324868 | 54014.288939 | ... | 62707.542810 | 64158.636494 | 65580.181268 | 67697.940960 | 69721.834461 | 72136.862278 | 75053.685321 | 78083.349162 | 81543.116776 | 84947.224981 |
| 21 | 2021-03-13 | NaN | 43295.054531 | 45915.936245 | 48431.183635 | 51720.209739 | 53141.890491 | 54588.783254 | 56474.697103 | 57213.803903 | ... | 63459.833764 | 64187.616792 | 64898.389810 | 67452.250000 | 69625.700000 | 71560.100000 | 74920.668641 | 77019.056638 | 78236.492380 | 79623.940550 |
| 22 | 2021-03-20 | NaN | 59698.663030 | 62695.074834 | 65462.752913 | 68562.094894 | 70906.615580 | 72896.598992 | 74840.741240 | 76018.814346 | ... | 83223.926699 | 84573.455680 | 85900.111288 | 87375.443817 | 89225.215183 | 91389.720700 | 94271.765642 | 103324.000000 | 105626.000000 | 108030.099231 |
| 23 | 2021-03-27 | NaN | 91629.429609 | 95731.128826 | 97899.520468 | 101485.813628 | 104342.062915 | 105814.731737 | 107964.297444 | 109957.877910 | ... | 119970.956952 | 122231.000000 | 124409.500000 | 127860.750000 | 131525.526774 | 133654.754890 | 137138.000000 | 142763.000000 | 147123.000000 | 152442.046481 |
| 24 | 2021-04-03 | NaN | 101229.133062 | 105019.256629 | 108412.163841 | 114161.000000 | 115579.000000 | 119702.471564 | 124320.023278 | 128289.961503 | ... | 139738.760480 | 141280.649613 | 142541.000000 | 144406.751030 | 147532.852447 | 149964.264660 | 154913.925237 | 161930.286195 | 166704.806187 | 169925.840358 |
25 rows × 25 columns
for model, df in fch_dfs.items():
standardize_forecast_dfs(df).to_csv(
here(f"data/processed/KIT_FCH_{model}.csv"), index=False
)RKI truth
truth = (
pd.read_csv(here("data/processed/RKI_county_weekly.csv"))
.groupby("date")
.agg(cases=("cases", "sum"))
)
truth["date"] = pd.to_datetime(truth.index, format="%Y-%m-%d").date
truth.reset_index(drop=True, inplace=True)truth = truth[(truth["date"] >= min_date) & (truth["date"] <= max_date)]assert all(truth.date.values == itww_county_repro.date.values)
assert all(truth.date.values == baseline.date.values)
assert all(truth.date.values == ensemble.date.values)fig, axs = plt.subplots(3, figsize=(10, 10), sharex=True)
for ax, model_name in zip(axs, model_names):
df = models[model_name]
ax.plot(df.date, df["50.0 %"])
ax.fill_between(df.date, df["25.0 %"], df["75.0 %"], color="gray", alpha=0.5)
ax.fill_between(df.date, df["2.5 %"], df["97.5 %"], color="lightgray", alpha=0.2)
ax.set_title(model_name)
ax.plot(truth["date"], truth["cases"], color="black")
ax.set_ylabel("Cases")
ax.set_xlabel("Date")
plt.tight_layout()
plt.show()
"KITCOVIDhub-median_ensemble"import pandas as pd
import requests
from typing import Optional
from tqdm.notebook import tqdm
def read_csvs_from_github_dir(
owner: str,
repo: str,
path: str,
branch: str = "main",
token: Optional[str] = None,
) -> pd.DataFrame:
"""
Read all CSV files from a GitHub directory and concatenate them.
Args:
owner: GitHub repository owner
repo: Repository name
path: Directory path within the repo
branch: Branch name (default: "main")
token: Optional GitHub token for private repos or rate limiting
Returns:
Concatenated DataFrame of all CSV files
"""
# Get directory contents via GitHub API
api_url = (
f"https://api.github.com/repos/{owner}/{repo}/contents/{path}?ref={branch}"
)
headers = {"Authorization": f"token {token}"} if token else {}
response = requests.get(api_url, headers=headers)
response.raise_for_status()
files = response.json()
csv_files = [f for f in files if f["name"].endswith(".csv")]
# Read each CSV from raw content URL
dfs = []
for file_info in tqdm(csv_files):
raw_url = file_info["download_url"]
df = pd.read_csv(raw_url)
dfs.append(df)
return pd.concat(dfs, ignore_index=True)# For your forecast hub case:
df = read_csvs_from_github_dir(
owner="KITmetricslab",
repo="covid19-forecast-hub-de",
path="data-processed/KIT-baseline",
branch="master",
)df[(df.target == "1 wk ahead inc case") * (df.location.str.match(r"GM\d\d"))]| forecast_date | target | target_end_date | location | type | quantile | value | location_name | |
|---|---|---|---|---|---|---|---|---|
| 198 | 2020-05-04 | 1 wk ahead inc case | 2020-05-09 | GM01 | point | NaN | 1900.0 | Baden-Württemberg State |
| 202 | 2020-05-04 | 1 wk ahead inc case | 2020-05-09 | GM01 | quantile | 0.010 | 743.0 | Baden-Württemberg State |
| 203 | 2020-05-04 | 1 wk ahead inc case | 2020-05-09 | GM01 | quantile | 0.025 | 878.0 | Baden-Württemberg State |
| 204 | 2020-05-04 | 1 wk ahead inc case | 2020-05-09 | GM01 | quantile | 0.050 | 1007.0 | Baden-Württemberg State |
| 205 | 2020-05-04 | 1 wk ahead inc case | 2020-05-09 | GM01 | quantile | 0.100 | 1172.0 | Baden-Württemberg State |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 886728 | 2022-01-17 | 1 wk ahead inc case | 2022-01-22 | GM16 | quantile | 0.850 | 57541.0 | Berlin State |
| 886729 | 2022-01-17 | 1 wk ahead inc case | 2022-01-22 | GM16 | quantile | 0.900 | 64056.0 | Berlin State |
| 886730 | 2022-01-17 | 1 wk ahead inc case | 2022-01-22 | GM16 | quantile | 0.950 | 74546.0 | Berlin State |
| 886731 | 2022-01-17 | 1 wk ahead inc case | 2022-01-22 | GM16 | quantile | 0.975 | 84472.0 | Berlin State |
| 886732 | 2022-01-17 | 1 wk ahead inc case | 2022-01-22 | GM16 | quantile | 0.990 | 97002.0 | Berlin State |
32256 rows × 8 columns
gm_to_ags = pd.DataFrame(
{
"gm": [f"GM{n:02d}" for n in range(1, 17)],
"ags": [
8,
9,
4,
2,
6,
3,
5,
7,
10,
1,
12,
13,
14,
15,
16,
11,
],
}
)gm_dict = (
df[df.location.str.match(r"GM\d\d")][["location", "location_name"]]
.drop_duplicates()
.reset_index(drop=True)
)ags_dict = pd.read_csv(here("data/processed/ags_state_dict.csv"))
pd.merge(
pd.merge(ags_dict, gm_to_ags, on="ags"), gm_dict, left_on="gm", right_on="location"
)| ags | name | gm | location | location_name | |
|---|---|---|---|---|---|
| 0 | 1 | Schleswig-Holstein | GM10 | GM10 | Schleswig-Holstein State |
| 1 | 2 | Hamburg | GM04 | GM04 | Free Hanseatic City of Hamburg |
| 2 | 3 | Niedersachsen | GM06 | GM06 | Lower Saxony State |
| 3 | 4 | Bremen | GM03 | GM03 | Free Hanseatic City of Bremen |
| 4 | 5 | Nordrhein-Westfalen | GM07 | GM07 | North Rhine-Westphalia State |
| 5 | 6 | Hessen | GM05 | GM05 | Hesse State |
| 6 | 7 | Rheinland-Pfalz | GM08 | GM08 | Rhineland-Palatinate State |
| 7 | 8 | Baden-Württemberg | GM01 | GM01 | Baden-Württemberg State |
| 8 | 9 | Bayern | GM02 | GM02 | Free State of Bavaria |
| 9 | 10 | Saarland | GM09 | GM09 | Saarland State |
| 10 | 11 | Berlin | GM16 | GM16 | Berlin State |
| 11 | 12 | Brandenburg | GM11 | GM11 | Brandenburg State |
| 12 | 13 | Mecklenburg-Vorpommern | GM12 | GM12 | Mecklenburg-Western Pomerania State |
| 13 | 14 | Sachsen | GM13 | GM13 | Free State of Saxony |
| 14 | 15 | Sachsen-Anhalt | GM14 | GM14 | Sachsen-Anhalt State |
| 15 | 16 | Thüringen | GM15 | GM15 | Free State of Thüringia |
| 16 | 16 | Thüringen | GM15 | GM15 | Free State of Thuringia |